1 Setup

2 Load data

# Read in count matrix 
counts <- read.delim("data/rsem.merged.gene_counts.tsv") %>% 
  mutate(gene_id = str_remove(gene_id, "\\..+")) %>% 
  select(-transcript_id.s.) %>%
  filter(!duplicated(gene_id)) %>%
  column_to_rownames("gene_id")

# Map ENSEMBL into symbols
symb <- clusterProfiler::bitr(rownames(counts), 
                              fromType = "ENSEMBL", toType = "SYMBOL", 
                              OrgDb = org.Hs.eg.db)
counts <- counts[symb$ENSEMBL, ]


# Read in metadata
meta <- read_excel("data/std_meta_Aus.xlsx")

3 Tidy data

# Set same names for sample names in the metadata and the count matrix 
counts <- counts[meta$sample] # same order
rownames(meta) <- meta$sample
counts <- counts[meta$sample]
#all(colnames(counts) == rownames(meta))

4 Create DESeq2 Object

# Create DESeq2 object 
meta <- meta %>% mutate(diagnosis = str_remove(diagnosis, " |-"),
                        diagnosis = factor(diagnosis), 
                        lobe = factor(lobe))
dds <- DESeqDataSetFromMatrix(countData = round(counts), 
                              colData = meta, 
                              design = ~ diagnosis + lobe)

# Prefilter
keep <- rowSums(counts(dds) >= 10) >= 5
dds <- dds[keep,]

5 Differential Expression Test

The design was specified when creating the object.

dds <- DESeq(dds)

Extract normalized/transformed counts:

norm <- counts(dds, normalized = TRUE)

# Export normalized counts 
write.csv(norm, "results/normalized_counts_DESeq2.csv", row.names = TRUE)

# Transformed variance: - for visualization
vsd <- vst(dds, blind = FALSE)

6 Results

6.1 Sample distances

vsd <- vsd[, vsd$diagnosis %in% c("Control", "FCDIIb")]

# Get sample-to-sample distances 
dist <- dist(t(assay(vsd)))

# Distance matrix 
sampleDistMatrix <- as.matrix( dist )
#colnames(sampleDistMatrix) <- NULL

colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

annotation_rows = data.frame(
    Diagnosis = vsd$diagnosis
)
rownames(annotation_rows) <- row.names(sampleDistMatrix)

ann_colors = list(
  Diagnosis = c("Control" = "darkgreen", "FCDIIb" = "orange")
)

pheatmap(sampleDistMatrix,
         clustering_distance_rows=dist,
         clustering_distance_cols=dist,
         annotation_row = NULL,
         annotation_col = annotation_rows,
         show_colnames = F,
         show_rownames = F,
         annotation_colors = ann_colors,
         col=colors,
         clustering_method = "complete"
) 

# Export as svg 
ggsave("results/sample_distance.svg",
       width = 6, 
       height = 4, 
       dpi = 300)

6.2 FCD IIb vs. Non-Epileptic Controls

res <- results(dds, contrast = c("diagnosis", "FCDIIb", "Control"))

All genes:

summary(res)
## 
## out of 20987 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 892, 4.3%
## LFC < 0 (down)     : 2684, 13%
## outliers [1]       : 8, 0.038%
## low counts [2]     : 6100, 29%
## (mean count < 126)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#resultsNames(dds)

res <- res %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>% 
  left_join(symb, by = c("gene" = "ENSEMBL")) %>% 
  select(gene, SYMBOL, log2FoldChange, pvalue, padj) %>% 
  mutate(across(c("log2FoldChange", padj), round, digits = 4))
DT::datatable(res, options = list(ordering = FALSE), filter = "top")

Significant changes (FC > 1.5):

sig <- res %>% 
  filter(padj < 0.1, abs(log2FoldChange) > log2(1.5)) %>% 
  arrange(desc(log2FoldChange))

sum(sig$log2FoldChange > 0)
## [1] 755
DT::datatable(sig, options = list(ordering = FALSE), filter = "top")

Export list:

setwd("~/Library/CloudStorage/GoogleDrive-icottagalvao@gmail.com/My Drive/Doutorado/Disciplinas/MO413 - Ciência de dados e visualização em saúde/ProjetoFinal")

# All genes with fold changes 
write.csv(res, "results/all_DEGs_FCDIIbvsControl.csv", row.names = FALSE)

# Significant genes 
write.csv(sig, "results/significant_DEGs_FCDIIbvsControl.csv", row.names = FALSE)

6.2.1 Visualization

Volcano plot:

# Create new column if up/down regulated 
res <- res %>% 
  mutate(Status = case_when(log2FoldChange > log2(1.5) & padj < 0.1 ~ "Up-regulated", log2FoldChange < -log2(1.5) & padj < 0.1 ~ "Down-regulated", TRUE ~ "Not significant"),
         label = ifelse((padj < 0.1 & abs(log2FoldChange) > 2.5), SYMBOL, "")) 

ggplot(res, aes(x = log2FoldChange, y = -log10(padj), color = Status, label = label)) + 
  geom_vline(xintercept = c(-log2(1.5), log2(1.5)), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.1), col = "gray", linetype = 'dashed') + 
  geom_point() + 
  ggrepel::geom_text_repel(size = 2, color = "black")+
  ylim(0, 5) + 
  scale_color_manual(values = c("Up-regulated"="red", "Down-regulated"="blue", "Not significant" = "black")) + 
  theme_classic() 

# Export as svg 
ggsave("results/volcano_plot.svg",
       width = 6, 
       height = 4, 
       dpi = 300)
# Get just significantly up-regulated genes 
up <- sig %>% 
  filter(log2FoldChange > 0, !is.na(SYMBOL))

write.csv(up, 
          "results/significantly_upregulated_genes.csv",
          row.names = FALSE)

6.2.2 Functional Enrichment

ego <- enrichGO(up$SYMBOL, 
             OrgDb = org.Hs.eg.db,
             ont = "BP", 
             keyType = "SYMBOL",
             pvalueCutoff = 0.05)

All functions:

# All functions
funs <- ego@result %>% 
  as.data.frame %>% 
  select(Description, GeneRatio, geneID)
DT::datatable(funs, options = list(ordering = FALSE), filter = "top")

Simplified:

ego.simp <- clusterProfiler::simplify(ego, cutoff = 0.5)
funs.simp <- ego.simp@result %>% 
  as.data.frame %>% 
  select(Description, GeneRatio, geneID) 
DT::datatable(funs.simp, options = list(ordering = FALSE))
# Export simplified functions table 
write.csv(funs.simp, 
          "results/functions.simp.csv")

7 Reproducibility

This document was last rendered on: 2024-05-17 19:00:02.599245.

Session Info
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Sao_Paulo
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pheatmap_1.0.12             RColorBrewer_1.1-3         
##  [3] readxl_1.4.3                lubridate_1.9.3            
##  [5] forcats_1.0.0               stringr_1.5.1              
##  [7] dplyr_1.1.4                 purrr_1.0.2                
##  [9] readr_2.1.5                 tidyr_1.3.1                
## [11] tibble_3.2.1                ggplot2_3.5.0              
## [13] tidyverse_2.0.0             clusterProfiler_4.10.1     
## [15] DESeq2_1.42.1               SummarizedExperiment_1.32.0
## [17] MatrixGenerics_1.14.0       matrixStats_1.2.0          
## [19] org.Hs.eg.db_3.18.0         edgeR_4.0.16               
## [21] limma_3.58.1                GenomicFeatures_1.54.4     
## [23] AnnotationDbi_1.64.1        Biobase_2.62.0             
## [25] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
## [27] IRanges_2.36.0              S4Vectors_0.40.2           
## [29] BiocGenerics_0.48.1        
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.3            BiocIO_1.12.0            bitops_1.0-7            
##   [4] ggplotify_0.1.2          filelock_1.0.3           cellranger_1.1.0        
##   [7] polyclip_1.10-6          XML_3.99-0.16.1          lifecycle_1.0.4         
##  [10] lattice_0.22-6           MASS_7.3-60.0.1          crosstalk_1.2.1         
##  [13] magrittr_2.0.3           sass_0.4.9               rmarkdown_2.26          
##  [16] jquerylib_0.1.4          yaml_2.3.8               cowplot_1.1.3           
##  [19] DBI_1.2.2                abind_1.4-5              zlibbioc_1.48.2         
##  [22] ggraph_2.2.1             RCurl_1.98-1.14          yulab.utils_0.1.4       
##  [25] tweenr_2.0.3             rappdirs_0.3.3           GenomeInfoDbData_1.2.11 
##  [28] enrichplot_1.22.0        ggrepel_0.9.5            tidytree_0.4.6          
##  [31] svglite_2.1.3            codetools_0.2-20         DelayedArray_0.28.0     
##  [34] DT_0.33                  DOSE_3.28.2              xml2_1.3.6              
##  [37] ggforce_0.4.2            tidyselect_1.2.1         aplot_0.2.2             
##  [40] farver_2.1.1             viridis_0.6.5            BiocFileCache_2.10.2    
##  [43] GenomicAlignments_1.38.2 jsonlite_1.8.8           tidygraph_1.3.1         
##  [46] systemfonts_1.0.6        tools_4.3.3              progress_1.2.3          
##  [49] ragg_1.3.0               treeio_1.26.0            Rcpp_1.0.12             
##  [52] glue_1.7.0               gridExtra_2.3            SparseArray_1.2.4       
##  [55] xfun_0.43                qvalue_2.34.0            withr_3.0.0             
##  [58] fastmap_1.1.1            fansi_1.0.6              digest_0.6.35           
##  [61] timechange_0.3.0         R6_2.5.1                 gridGraphics_0.5-1      
##  [64] textshaping_0.3.7        colorspace_2.1-0         GO.db_3.18.0            
##  [67] biomaRt_2.58.2           RSQLite_2.3.6            utf8_1.2.4              
##  [70] generics_0.1.3           data.table_1.15.4        rtracklayer_1.62.0      
##  [73] htmlwidgets_1.6.4        prettyunits_1.2.0        graphlayouts_1.1.1      
##  [76] httr_1.4.7               S4Arrays_1.2.1           scatterpie_0.2.2        
##  [79] pkgconfig_2.0.3          gtable_0.3.4             blob_1.2.4              
##  [82] XVector_0.42.0           shadowtext_0.1.3         htmltools_0.5.8.1       
##  [85] fgsea_1.28.0             scales_1.3.0             png_0.1-8               
##  [88] ggfun_0.1.4              knitr_1.45               rstudioapi_0.16.0       
##  [91] tzdb_0.4.0               reshape2_1.4.4           rjson_0.2.21            
##  [94] nlme_3.1-164             curl_5.2.1               cachem_1.0.8            
##  [97] parallel_4.3.3           HDO.db_0.99.1            restfulr_0.0.15         
## [100] pillar_1.9.0             vctrs_0.6.5              dbplyr_2.5.0            
## [103] evaluate_0.23            cli_3.6.2                locfit_1.5-9.9          
## [106] compiler_4.3.3           Rsamtools_2.18.0         rlang_1.1.3             
## [109] crayon_1.5.2             labeling_0.4.3           plyr_1.8.9              
## [112] fs_1.6.3                 stringi_1.8.3            viridisLite_0.4.2       
## [115] BiocParallel_1.36.0      munsell_0.5.1            Biostrings_2.70.3       
## [118] lazyeval_0.2.2           GOSemSim_2.28.1          Matrix_1.6-5            
## [121] hms_1.1.3                patchwork_1.2.0          bit64_4.0.5             
## [124] KEGGREST_1.42.0          statmod_1.5.0            highr_0.10              
## [127] igraph_2.0.3             memoise_2.0.1            bslib_0.7.0             
## [130] ggtree_3.10.1            fastmatch_1.1-4          bit_4.0.5               
## [133] ape_5.7-1                gson_0.1.0